From 87eebf8dd5ec4564aa1cfca4fe7e53fbd29da3d5 Mon Sep 17 00:00:00 2001
From: "C. Scott Ananian"
Date: Thu, 8 Jan 2015 17:00:54 -0500
Subject: [PATCH] Support IPv6 URLs in bracketed and auto links.

The corresponding patch for Parsoid is
Ibb33188cdfe2004e469c3f6ee6f30d34d1923283.

Task: T23261
Change-Id: Iff077bf31168b431febb243e2e62f2c6502616bc
---
includes/Sanitizer.php | 5 +++
includes/parser/Parser.php | 22 +++++++---
tests/parser/parserTests.txt | 85 ++++++++++++++++++++++++++++++++++--
3 files changed, 101 insertions(+), 11 deletions(-)
diff --git a/includes/Sanitizer.php b/includes/Sanitizer.php
index 387f24faa0..30981c368f 100644
--- a/includes/Sanitizer.php
+++ b/includes/Sanitizer.php
@@ -1809,6 +1809,11 @@ class Sanitizer {
$host = preg_replace( $strip, '', $host );
+ // IPv6 host names are bracketed with []. Url-decode these.
+ if ( substr_compare( "//%5B", $host, 0, 5 ) === 0 && preg_match( '!^//%5B(.*?)%5D((:\d+)?)$!', $host, $matches ) ) {
+ $host = '//[' . $matches[1] . ']' . $matches[2];
+ }
+
// @todo FIXME: Validate hostnames here
return $protocol . $host . $rest;
diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php
index 6189997a92..2eec08bf91 100644
--- a/includes/parser/Parser.php
+++ b/includes/parser/Parser.php
@@ -87,7 +87,11 @@ class Parser {
# \p{Zs} is unicode 'separator, space' category. It covers the space 0x20
# as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]';
- const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)([^][<>"\\x00-\\x20\\x7F\p{Zs}]+)
+ # Simplified expression to match an IPv4 or IPv6 address, or
+ # at least one character of a host name (embeds EXT_LINK_URL_CLASS)
+ const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])';
+ # RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR)
+ const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}]+)
\\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu';
# Regular expression for a non-newline space
@@ -254,7 +258,8 @@ class Parser {
$this->mConf = $conf;
$this->mUrlProtocols = wfUrlProtocols();
$this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' .
- self::EXT_LINK_URL_CLASS . '+)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
+ self::EXT_LINK_ADDR .
+ self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su';
if ( isset( $conf['preprocessorClass'] ) ) {
$this->mPreprocessorClass = $conf['preprocessorClass'];
} elseif ( defined( 'HPHP_VERSION' ) ) {
@@ -1378,6 +1383,7 @@ class Parser {
public function doMagicLinks( $text ) {
$prots = wfUrlProtocolsWithoutProtRel();
$urlChar = self::EXT_LINK_URL_CLASS;
+ $addr = self::EXT_LINK_ADDR;
$space = self::SPACE_NOT_NL; # non-newline space
$spdash = "(?:-|$space)"; # a dash or a non-newline space
$spaces = "$space++"; # possessive match of 1 or more spaces
@@ -1386,7 +1392,7 @@ class Parser {
(].*?) | # m[1]: Skip link text
(<.*?>) | # m[2]: Skip stuff inside
# HTML elements' . "
- (\b(?i:$prots)($urlChar+)) | # m[3]: Free external links
+ (\b(?i:$prots)($addr$urlChar*)) | # m[3]: Free external links
# m[4]: Post-protocol path
\b(?:RFC|PMID) $spaces # m[5]: RFC or PMID, capture number
([0-9]+)\b |
@@ -1499,14 +1505,14 @@ class Parser {
$url = substr( $url, 0, -$numSepChars );
}
- $url = Sanitizer::cleanUrl( $url );
-
# Verify that we still have a real URL after trail removal, and
# not just lone protocol
if ( strlen( $trail ) >= $numPostProto ) {
return $url . $trail;
}
+ $url = Sanitizer::cleanUrl( $url );
+
# Is this an external image?
$text = $this->maybeMakeExternalImage( $url );
if ( $text === false ) {
@@ -5415,9 +5421,10 @@ class Parser {
case 'gallery-internal-link':
$linkValue = strip_tags( $this->replaceLinkHoldersText( $match ) );
$chars = self::EXT_LINK_URL_CLASS;
+ $addr = self::EXT_LINK_ADDR;
$prots = $this->mUrlProtocols;
//check to see if link matches an absolute url, if not then it must be a wiki link.
- if ( preg_match( "/^($prots)$chars+$/u", $linkValue ) ) {
+ if ( preg_match( "/^($prots)$addr$chars*$/u", $linkValue ) ) {
$link = $linkValue;
} else {
$localLinkTitle = Title::newFromText( $linkValue );
@@ -5599,13 +5606,14 @@ class Parser {
break;
case 'link':
$chars = self::EXT_LINK_URL_CLASS;
+ $addr = self::EXT_LINK_ADDR;
$prots = $this->mUrlProtocols;
if ( $value === '' ) {
$paramName = 'no-link';
$value = true;
$validated = true;
} elseif ( preg_match( "/^((?i)$prots)/", $value ) ) {
- if ( preg_match( "/^((?i)$prots)$chars+$/u", $value, $m ) ) {
+ if ( preg_match( "/^((?i)$prots)$addr$chars*$/u", $value, $m ) ) {
$paramName = 'link-url';
$this->mOutput->addExternalLink( $value );
if ( $this->mOptions->getExternalLinkTarget() ) {
diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt
index ffa435c08b..f6ca577186 100644
--- a/tests/parser/parserTests.txt
+++ b/tests/parser/parserTests.txt
@@ -5341,14 +5341,91 @@ http://example.com/index.php?foozoid[]=bar
!! end
!! test
-IPv6 urls (bug 21261)
-!! options
-disabled
+IPv6 urls, autolink format (T23261)
!! wikitext
http://[2404:130:0:1000::187:2]/index.php
+
+Examples from RFC2373, section 2.2:
+* http://[1080::8:800:200C:417A]/unicast
+* http://[FF01::101]/multicast
+* http://[::1]/loopback
+* http://[::]/unspecified
+* http://[::13.1.68.3]/ipv4compat
+* http://[::FFFF:129.144.52.38]/ipv4compat
+
+Examples from RFC 2732, section 2:
+* http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html
+* http://[1080:0:0:0:8:800:200C:417A]/index.html
+* http://[3ffe:2a00:100:7031::1]
+* http://[1080::8:800:200C:417A]/foo
+* http://[::192.9.5.5]/ipng
+* http://[::FFFF:129.144.52.38]:80/index.html
+* http://[2010:836B:4179::836B:4179]
+
!! html
http://[2404:130:0:1000::187:2]/index.php
-
+
Examples from RFC2373, section 2.2:
+
+
+Examples from RFC 2732, section 2:
+
+
+
+!! end
+
+!! test
+IPv6 urls, bracketed format (T23261)
+!! wikitext
+[http://[2404:130:0:1000::187:2]/index.php test]
+
+Examples from RFC2373, section 2.2:
+* [http://[1080::8:800:200C:417A] unicast]
+* [http://[FF01::101] multicast]
+* [http://[::1]/ loopback]
+* [http://[::] unspecified]
+* [http://[::13.1.68.3] ipv4compat]
+* [http://[::FFFF:129.144.52.38] ipv4compat]
+
+Examples from RFC 2732, section 2:
+* [http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:80/index.html 1]
+* [http://[1080:0:0:0:8:800:200C:417A]/index.html 2]
+* [http://[3ffe:2a00:100:7031::1] 3]
+* [http://[1080::8:800:200C:417A]/foo 4]
+* [http://[::192.9.5.5]/ipng 5]
+* [http://[::FFFF:129.144.52.38]:80/index.html 6]
+* [http://[2010:836B:4179::836B:4179] 7]
+
+!! html
+test
+
Examples from RFC2373, section 2.2:
+
+
+Examples from RFC 2732, section 2:
+
+
+
!! end
!! test
--
2.20.1